/* this dofile does the relatedness analysis.

Question: which industries A and B are connected,
in the sense that individuals have an excess transition probability from A to B?


data file (constructed at server): 
- balanced panel of workers 1996-2006, full time workers all year
- for each nace2 this file gives the transition to nace2 next year
example: 
nace2  n_nace2	nace2_plus	n_nace2_plus
1		10		1			7
1		10		2			2
1		10		3			1
Here there are 10 workers in nace2==1. 7 of these stay in nace2==1 and 3 leave.

key definitions:

relatedness1:
i. f_AB=prob_transition (A to B)
ii. Assign .1 to the 10% of B industries with the lowest f_AB, .2 to the next decile and so on. The most common destination industries for workers leaving A thus get the highest values.

relatedness2:
i. f_AB-f_B where f_B is fraction of workers in Sector B.
ii. same as relatedness1

relatedness3:
i. f_AB-f_XB where f_XB is the fraction of movers in the economy that move into B.
ii. same as relatedness1

Arguably, relatedness3 is the more appropriate measure. */

*******************
** 0. Globals *****
*******************
* number of quantile bins passed to nquantiles() when the relatedness ranks are built
global bins 10

*****************
** I. Prelims ***
*****************
* load the nace2 -> nace2_plus transition data
use nace_stats_2, clear

* remove unknown industries (codes 0 and 99) and household activities
* (code 95, only a handful of workers), both as origin and as destination
drop if inlist(nace2, 0, 95, 99) | inlist(nace2_plus, 0, 95, 99)

* add a row for every nace2 x nace2_plus x yr combination without observed moves
fillin nace2 nace2_plus yr
tabulate _fillin

* filled-in rows lack the origin headcount: copy it from the observed rows
* of the same yr x nace2 cell
bysort yr nace2: egen origin_size = max(n_nace2)
replace n_nace2 = origin_size if n_nace2 == .
drop origin_size

* a filled-in combination means zero movers and a zero transition fraction
replace n_nace2_plus = 0 if n_nace2_plus == .
replace transition_fraction_2 = 0 if transition_fraction_2 == .
drop _fillin

* attach industry names, first for the origin code and then for the destination code.
* the lookup file is keyed on Code, so each variable is renamed to Code for the merge
* and renamed back afterwards.
foreach ind in nace2 nace2_plus {
    rename `ind' Code
    sort Code
    merge Code using nace_rev_1_1_2digit, nokeep keep(Description Definition)
    tabulate _merge
    drop _merge
    rename Code `ind'
    rename Description `ind'_name
}

* NOTE: rows with nace2==nace2_plus must stay in the data. the own_transition
* variable computed in the yearly relatedness step is taken from exactly these rows,
* so do NOT drop them here.
save temp1, replace

************************************************
** II. Relatedness based on worker mobility ****
************************************************

*************************************
** Calculate relatedness, yearly ****
*************************************
* For every base year t the loop reads the year-t rows of temp1, builds the three
* yearly relatedness inputs (f1_yearly, f2_yearly, f3_yearly) for each
* nace2 x nace2_plus pair and stores the result in a yearly file on disk.
forvalues t=1996/2005 {  //years covered in underlying data is 1996-2006
    use if yr==`t' using temp1, clear

    ** fraction of workforce tomorrow per nace
    egen N_plus=sum(n_nace2_plus)  //inds in economy
    bys nace2_plus: egen N_nace2_plus=sum(n_nace2_plus)  //inds in nace2 tomorrow
    gen frac_nace2_plus=N_nace2_plus/N_plus  //fraction of workers tomorrow in nace2

    ** movers (for descriptives table): workers leaving nace2 for any other industry
    gen m=n_nace2_plus if nace2!=nace2_plus
    replace m=0 if nace2==nace2_plus
    bys nace2: egen movers=sum(m)
    drop m

    preserve  //check: the employment shares should add up to one across destinations
    duplicates drop nace2_plus, force
    egen aaa=sum(frac_nace2_plus)
    sum aaa
    restore

    ** transition fractions for those that move
    * own_transition is the stay probability, read from the nace2==nace2_plus row
    gen a=transition_fraction_2 if nace2==nace2_plus
    bys nace2: egen own_transition=max(a)
    drop a
    gen transition_fraction_2_adj=transition_fraction_2/(1-own_transition) //prob moving to that sector conditional on moving
    replace transition_fraction_2_adj=. if nace2==nace2_plus
    bys nace2: egen m3=sum(transition_fraction_2_adj)
    sum m3  //check

    ** fraction of moves that go into a given industry
    * FIX: the weight has to be the NUMBER of movers on the A to B cell (n_nace2_plus).
    * the previous version used the industry code nace2_plus itself, which made the
    * shares depend on the numeric value of the code. the adding-up check below could
    * not reveal this because shares sum to one for any set of weights.
    gen a=n_nace2_plus if nace2!=nace2_plus
    egen N_movers=sum(a)  //all movers in the economy
    bys nace2_plus: egen N_movers_nace2_plus=sum(a)  //movers arriving in nace2_plus
    gen frac_movers_nace2_plus=N_movers_nace2_plus/N_movers

    preserve  //check: mover shares add up to one across destinations
    duplicates drop nace2_plus, force
    egen m4=sum(frac_movers_nace2_plus)
    sum m4 //check
    restore

    ** excess fraction that moves to nace2_plus (excess compared to nace2_plus fraction of total employment)
    gen f1_yearly=transition_fraction_2_adj //fraction of movers that go into industry B
    gen f2_yearly=transition_fraction_2_adj-frac_nace2_plus //excess fraction of movers, relative to size of B
    gen f3_yearly=transition_fraction_2_adj-frac_movers_nace2_plus //excess fraction of movers, relative to fraction of movers in economy that goes to B

    save yearly`t', replace
}

* stack the yearly files (base years 1996-2005) into one dataset and
* delete each yearly file from disk once it has been appended
clear
forvalues year = 1996/2005 {
    append using yearly`year'
    erase yearly`year'.dta
}

save temp2, replace

*****************************************************
** Calculate relatedness as average across years ****
*****************************************************
use temp2, clear

* average each yearly measure over all years, per nace2 x nace2_plus pair
forvalues k = 1/3 {
    bysort nace2 nace2_plus: egen f`k' = mean(f`k'_yearly)
}

* the yearly inputs are no longer needed
drop *yearly

* the averages are constant within a pair, so a single year is enough to keep
* one obs per nace2 x nace2_plus combination (fillin guarantees there are no gaps)
keep if yr==2002

** construct relatedness xtiles (deciles)
forvalues k = 1/3 {
    * rank destinations within each origin industry into $bins quantile groups
    xtileJ r`k'_xtile=f`k', nquantiles($bins) by(nace2)

    * variability (not really needed)
    bysort nace2: egen var_f`k' = var(f`k')
    bysort nace2: egen mean_f`k' = mean(f`k')
}

* file to use for descriptives
keep nace2 nace2_plus f* r* nace2_name nace2_plus_name n_nace2 n_nace2_plus movers
order nace2 nace2_plus f* r* nace2_name nace2_plus_name
sort nace2 nace2_plus
save excess_transition, replace

* same content with the industry identifiers renamed to make it consistent with hans_file2.dta
rename nace2 p_nace5_2
rename nace2_plus naceSSB_2
sort p_nace5_2 naceSSB_2
save excess_transition_adapted, replace
